The (interactive) correlation heatmap reveals very high correlation among aminoacids and TG compounds.
load("data/data.Rdata")
X <- df[df$Diagnosis == "Probable AD", 1:230]
C <- round(cor(X), 2)
heatmaply_cor(C, color = "magma", plot_method = "plotly", dendrogram = F, reorderfun = sort.default(d, w), main = "Correlation Heatmap", file = "heatmap.html", colorbar_thickness = 15, colorbar_len = 0.5)
Performing a Global Test of ApoE4 dose-effects, on serum metabolites of AD patients, correcting for sex (Ho: ApoE4 dose has no effect on mean metabolite levels, Ha: it has an effect), yields a significant difference (p = 0.0171). The most significantly affected metabolites are triglycerides and diglycerides (FDR-adjusted p-value <0.05).
AD <- subset(df, Diagnosis == "Probable AD")
AD$E4dose <- as.factor(AD$E4dose)
gt <- globaltest::gt(E4dose ~ 1, E4dose ~ . - E4 - Diagnosis - target, data = AD, standardize = TRUE)
gt
p-value Statistic Expected Std.dev #Cov
1 0.0171 1.23 0.8 0.168 232
covariates <- covariates(gt, sort = T)
res.gt <- extract(covariates) %>%
sort() %>%
p.adjust(, method = "FDR") %>%
globaltest::result() %>%
mutate_if(is.numeric, round, digits = 3)
res.gt$direction <- as.factor(res.gt$direction)
levels(res.gt$direction) <- c("no ApoE4", "1 ApoE4", "2 ApoE4")
res.gt$alias <- NULL
res.gt <- res.gt[, 1:4]
names(res.gt) <- c("Inheritance", "Assoc. with", "FDR", "p-value")
res.gt <- res.gt[, c(1:(ncol(res.gt) - 2), ncol(res.gt), (ncol(res.gt) - 1))]
filter(res.gt, `p-value` < 0.05)
Inheritance Assoc. with p-value FDR
Lip.TG.56.2. 0.046 1 ApoE4 0.000 0.016
Lip.TG.58.1. 0.117 1 ApoE4 0.000 0.021
Lip.DG.36.3. 0.190 1 ApoE4 0.000 0.021
Lip.TG.56.3. 0.244 1 ApoE4 0.000 0.021
Lip.TG.52.3. 0.424 1 ApoE4 0.001 0.031
Lip.TG.54.5. 0.480 1 ApoE4 0.001 0.027
Lip.TG.58.2. 0.722 1 ApoE4 0.001 0.027
Lip.TG.54.4. 0.761 1 ApoE4 0.002 0.033
Lip.TG.58.9. 1.000 1 ApoE4 0.001 0.027
Lip.TG.56.7. 1.000 1 ApoE4 0.001 0.027
Lip.TG.58.8. 1.000 1 ApoE4 0.001 0.032
Lip.TG.54.6. 1.000 1 ApoE4 0.002 0.035
Lip.TG.56.8. 1.000 1 ApoE4 0.002 0.037
Lip.TG.52.4. 1.000 1 ApoE4 0.003 0.055
Lip.TG.54.3. 1.000 1 ApoE4 0.004 0.055
Lip.TG.56.6. 1.000 1 ApoE4 0.004 0.059
Lip.TG.56.1. 1.000 1 ApoE4 0.004 0.059
Lip.TG.52.2. 1.000 1 ApoE4 0.005 0.063
Lip.TG.54.2. 1.000 1 ApoE4 0.005 0.063
Lip.TG.60.2. 1.000 1 ApoE4 0.007 0.077
Lip.TG.58.10. 1.000 1 ApoE4 0.011 0.124
Lip.TG.51.3. 1.000 1 ApoE4 0.015 0.154
Lip.SM.d18.1.18.1. 1.000 2 ApoE4 0.018 0.169
OA.OA09...2.ketoglutaric.acid 1.000 2 ApoE4 0.019 0.169
Lip.TG.52.0. 1.000 1 ApoE4 0.019 0.169
Lip.TG.50.3. 1.000 2 ApoE4 0.020 0.169
OA.OA29...Uracil 1.000 no ApoE4 0.020 0.169
Lip.TG.52.5. 1.000 1 ApoE4 0.021 0.174
Lip.TG.54.0. 1.000 1 ApoE4 0.022 0.174
Lip.TG.50.2. 1.000 2 ApoE4 0.022 0.174
Lip.TG.51.2. 1.000 1 ApoE4 0.023 0.174
Am.L.Glutamine 1.000 no ApoE4 0.024 0.174
Lip.TG.50.1. 1.000 2 ApoE4 0.025 0.174
Lip.TG.50.4. 1.000 1 ApoE4 0.026 0.176
Lip.TG.52.1. 1.000 1 ApoE4 0.027 0.176
Lip.TG.54.1. 1.000 1 ApoE4 0.031 0.203
Lip.TG.50.0. 1.000 1 ApoE4 0.037 0.229
OA.OA08...Malic.acid 1.000 2 ApoE4 0.038 0.234
Lip.DG.36.2. 1.000 1 ApoE4 0.039 0.235
OS.HpH.PAF.C16.0 1.000 2 ApoE4 0.049 0.283
Metabolite-level models to test for ApoE4 dose effects, correcting for age at diagnosis, sex, smoking status, alcohol consumption, hypertension (and medication), hyperlipidemia (and medication), anticoagulant medication, anti-depressants, mean arterial pressure (MAP) and body mass index (BMI). The F-tests yielded p-values < 0.05, however none of them survived FDR correction. Interestingly, in the SCD group, aminoacids are mostly affected and no TGs or DGs compared to the AD group.
nested <- function(Y, x, ...) {
### Analysis of Variance (ANOVA)
x <- as.factor(x)
df <- cbind(Y, x, ...)
ncol <- ncol(Y)
covariates <- names(...)
F_tests <- furrr::future_map(df[, 1:ncol], ~ {
frm <- as.formula(paste0(".x ~", paste(covariates, collapse = "+")))
rm <- lm(frm, data = df)
ffm <- as.formula(paste0(".x ~ x +", paste(covariates, collapse = "+")))
fm <- lm(ffm, data = df)
anova(rm, fm)
})
# Correction for Multiple Testing
# Create a list to store the p-values
# Extract the p-values of the F-tests from the anova summaries list and store them in p_values
p_values <- F_tests %>%
future_map(~ .x[["Pr(>F)"]][[2]])
# Coerce p_values to dataframe and transpose it
p_values <- as.data.frame(p_values) %>%
t() %>%
data.frame(row.names = colnames(Y))
# Calculate the FDR-adjusted p-values
p_values$p_adj <- p.adjust(p_values[, ], method = "fdr", n = 230)
p_values <- round(p_values, 3)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
sig <- as.data.frame(dplyr::filter(p_values, `.` < 0.05))
sig <- sig[order(sig$p_adj), ]
names(sig) <- c("P(>F)", "FDR")
cat("Significant F-tests: \n")
print(sig)
cat("\nDose effects on metabolites:")
summaries <- furrr::future_map(df[, rownames(sig)], ~ {
f <- as.formula(paste0(".x ~ x +", paste(covariates, collapse = "+")))
mdl <- lm(f, data = df)
summary(mdl)$coefficients[1:length(table(x)), ]
})
summaries <- lapply(summaries, function(x) {
x <- x[, -c(2, 3)]
x <- t(x)
})
summaries <- plyr::ldply(summaries, id = 1:2) %>%
mutate(across(where(is.numeric), round, digits = 3))
coeffs <- summaries[c(TRUE, FALSE), ]
pvalues <- summaries[c(FALSE, TRUE), ]
t_tests <- cbind.data.frame(coeffs[, 1], coeffs[, 2], pvalues[, 2], coeffs[, 3], pvalues[, 3], coeffs[, 4], pvalues[, 4])
names(t_tests) <- c("Metabolite", "No ApoE4", "P(>t)", "ApoE4x1", "P(>t)", "ApoE4x2", "P(>t)")
return(t_tests)
}
Y <- df[, 1:230]
df$E4dose <- as.numeric(df$E4dose) - 1
df$E4dose[df$E4dose == 0] <- "No ApoE4"
df$E4dose[df$E4dose == 1] <- "1 ApoE4"
df$E4dose[df$E4dose == 2] <- "2 ApoE4"
E4dose <- df$E4dose <- as.factor(df$E4dose)
Among AD patients, ApoE4 dose seems to have positive effect on several triglycerides, diglycerides, putrescine, 2-ketoglutraric acid, lysophosphatidylcholin, HpH.PAF.-C16.0 and -C18.0
load("data/clinical.Rdata")
AD <- subset(df, Diagnosis == "Probable AD")
ADY <- AD[, 1:230]
ADE4dose <- AD$E4dose
ADclinical <- clinical[df$Diagnosis == "Probable AD", ]
## Testing for effects of ApoE4 dose
nested(ADY, relevel(ADE4dose, ref = "No ApoE4"), ADclinical)
Significant F-tests:
P(>F) FDR
Lip.TG.52.3. 0.001 0.156
Lip.TG.52.4. 0.003 0.156
Lip.DG.36.3. 0.002 0.156
OS.HpH.PAF.C16.0 0.002 0.156
Lip.TG.52.2. 0.006 0.255
Lip.TG.54.5. 0.010 0.373
Lip.TG.50.2. 0.012 0.398
Lip.TG.50.1. 0.018 0.450
Lip.TG.54.4. 0.020 0.450
Lip.TG.54.6. 0.017 0.450
Am.Putrescine 0.029 0.515
OA.OA09...2.ketoglutaric.acid 0.026 0.515
Lip.TG.50.3. 0.028 0.515
Lip.TG.52.5. 0.042 0.538
Lip.TG.56.7. 0.042 0.538
Lip.TG.56.8. 0.044 0.538
Lip.TG.58.9. 0.042 0.538
Lip.LPC.16.0. 0.033 0.538
OS.HpH.LPA.C18.0 0.044 0.538
Dose effects on metabolites:
Metabolite No ApoE4 P(>t) ApoE4x1 P(>t) ApoE4x2 P(>t)
1 Lip.TG.52.3. -1.443 0.818 2.568 0.004 3.719 0.001
2 Lip.TG.52.4. 0.620 0.888 1.628 0.008 2.392 0.002
3 Lip.DG.36.3. 0.026 0.631 0.019 0.012 0.031 0.001
4 OS.HpH.PAF.C16.0 34.751 0.000 2.593 0.017 4.645 0.001
5 Lip.TG.52.2. -5.080 0.469 2.124 0.030 3.744 0.002
6 Lip.TG.54.5. 1.863 0.496 0.928 0.016 1.280 0.006
7 Lip.TG.50.2. -4.499 0.297 0.882 0.141 2.193 0.003
8 Lip.TG.50.1. -2.324 0.539 0.706 0.179 1.837 0.005
9 Lip.TG.54.4. 3.457 0.318 1.178 0.015 1.369 0.020
10 Lip.TG.54.6. -0.434 0.795 0.515 0.027 0.746 0.009
11 Am.Putrescine 0.004 0.000 0.000 0.035 0.000 0.017
12 OA.OA09...2.ketoglutaric.acid 0.008 0.366 0.000 0.953 0.003 0.014
13 Lip.TG.50.3. -1.203 0.668 0.594 0.127 1.268 0.008
14 Lip.TG.52.5. 0.459 0.789 0.358 0.134 0.725 0.014
15 Lip.TG.56.7. -2.394 0.095 0.487 0.015 0.392 0.105
16 Lip.TG.56.8. -1.393 0.055 0.248 0.014 0.175 0.151
17 Lip.TG.58.9. -0.447 0.068 0.085 0.013 0.034 0.410
18 Lip.LPC.16.0. 4.220 0.028 0.312 0.237 0.850 0.009
19 OS.HpH.LPA.C18.0 0.498 0.038 0.044 0.178 0.101 0.013
Among individuals with SCD, lipid metabolites were not affected as much as in the AD group, with only two sphingomyelin species showing a difference. Aminoacids L-serine, tryptophan, glycine,trytptophan, L-homoserine, putrescine were affected in this group.
SCD <- df[df$Diagnosis == "Subjectieve klachten", ]
SCDY <- SCD[, 1:230]
SCDE4dose <- SCD$E4dose
SCDclinical <- clinical[df$Diagnosis == "Subjectieve klachten", ]
## Testing for effects of ApoE4 dose
nested(SCDY, relevel(SCDE4dose, ref = "No ApoE4"), SCDclinical)
Significant F-tests:
P(>F) FDR
Am.L.Serine 0.002 0.285
Am.L.Tryptophan 0.002 0.285
Am.Glycine 0.006 0.450
Am.L.homoserine 0.047 0.931
Am.Putrescine 0.045 0.931
Lip.SM.d18.1.22.0. 0.043 0.931
Lip.SM.d18.1.23.0. 0.029 0.931
Dose effects on metabolites:
Metabolite No ApoE4 P(>t) ApoE4x1 P(>t) ApoE4x2 P(>t)
1 Am.L.Serine 4.669 0.000 0.191 0.115 -1.058 0.003
2 Am.L.Tryptophan 3.747 0.004 -0.307 0.047 -1.364 0.002
3 Am.Glycine 4.332 0.001 0.382 0.015 -0.873 0.052
4 Am.L.homoserine 0.003 0.001 0.000 0.723 -0.001 0.014
5 Am.Putrescine 0.000 0.969 0.001 0.013 0.000 0.927
6 Lip.SM.d18.1.22.0. 3.337 0.002 0.309 0.015 0.272 0.448
7 Lip.SM.d18.1.23.0. 1.241 0.010 0.144 0.012 0.176 0.277
The discriminative potential of the metabolites, correcting for clinical background features, on AD and ApoE4 or not (4-class response ADE4, AD, SCDE4, SCD) is assessed by first fitting a Multi-nomial Logistic Regression (benchmark model). The benchmark model fits only the clinical features. The model’s AUC is compared with the same model (with altered hyperparameters) fitting clinical variables + 230 metabolites. The metabolites are then projected to 6 latent ML-estimated factors and 3 more models are fitted: Multi-nomial Logistic Regression, Decision Tree and XGBoost fitting clinical variables + 6 factors. The models’ multi-class classification performance is compared using confusion matrices and AUCs.
multifit <- function(X,
y,
model,
ctrl = NULL,
grid = NULL,
seed = 87654, ...) {
set.seed(seed)
# Merge X and y into df
df <- cbind.data.frame(X, y)
# Train the model
model <- caret::train(df[, 1:ncol(X)], df$y,
method = model,
tuneGrid = grid,
trControl = ctrl,
metric = "logLoss",
# importance=TRUE,
...
)
obs <- model$pred$obs
preds <- model$pred$pred
# Get the multi-class ROC curves
ys <- as.numeric(obs) - 1
yhats <- as.numeric(preds) - 1
roc <- multiclass.roc(
response = ys,
predictor = yhats, quiet = T, legacy.axes = T
)
print(roc$auc)
names <- c(
"ADE4 vs. AD", "ADE4 vs. SCDE4",
"ADE4 vs. SCD", "AD vs SCDE4", "AD vs. SCD", "SCDE4 vs. SCD"
)
for (i in 1:length(names)) {
names[[i]] <- paste0(
names[[i]], ", AUC=",
paste(round(auc(roc$rocs[[i]]), 2))
)
}
names(roc$rocs) <- names
cat("\n")
# Create a confusion matrix and get performance metrics from caret
cm <- confusionMatrix(reference = obs, data = preds, mode = "everything")
print(cm)
out <- list("model"= model, "roc"= roc)
return(out)
}
load("data/data.Rdata")
X <- scale(df[, 1:230])
y <- df$target
Xclin <- cbind.data.frame(X, clin_dummy)
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 1,
savePredictions = "final",
classProbs = T,
summaryFunction = multiClassSummary,
selectionFunction = best,
search = "random",
sampling = "smote"
)
benchmark <- multifit(
X = clin_dummy,
y = y,
model = "multinom",
ctrl = ctrl,
grid = expand.grid(decay = 0),
trace = F
)
Multi-class area under the curve: 0.8078
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 47 22 0 2
ADE4 37 9 0 2
SCD 0 2 20 43
SCDE4 2 1 20 40
Overall Statistics
Accuracy : 0.4696
95% CI : (0.4061, 0.5339)
No Information Rate : 0.3522
P-Value [Acc > NIR] : 9.621e-05
Kappa : 0.284
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.5465 0.26471 0.50000 0.4598
Specificity 0.8509 0.81690 0.78261 0.8562
Pos Pred Value 0.6620 0.18750 0.30769 0.6349
Neg Pred Value 0.7784 0.87437 0.89011 0.7446
Precision 0.6620 0.18750 0.30769 0.6349
Recall 0.5465 0.26471 0.50000 0.4598
F1 0.5987 0.21951 0.38095 0.5333
Prevalence 0.3482 0.13765 0.16194 0.3522
Detection Rate 0.1903 0.03644 0.08097 0.1619
Detection Prevalence 0.2874 0.19433 0.26316 0.2551
Balanced Accuracy 0.6987 0.54080 0.64130 0.6580
g <- ggroc(benchmark$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
mlr <- multifit(
X = Xclin,
y = y,
model = "multinom",
ctrl = ctrl,
grid = expand.grid(decay = 9.187724),
trace = F,
importance=T
)
Multi-class area under the curve: 0.8323
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 61 24 0 0
ADE4 25 9 0 1
SCD 0 0 17 30
SCDE4 0 1 23 56
Overall Statistics
Accuracy : 0.5789
95% CI : (0.5147, 0.6413)
No Information Rate : 0.3522
P-Value [Acc > NIR] : 3.308e-13
Kappa : 0.4118
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.7093 0.26471 0.42500 0.6437
Specificity 0.8509 0.87793 0.85507 0.8500
Pos Pred Value 0.7176 0.25714 0.36170 0.7000
Neg Pred Value 0.8457 0.88208 0.88500 0.8144
Precision 0.7176 0.25714 0.36170 0.7000
Recall 0.7093 0.26471 0.42500 0.6437
F1 0.7135 0.26087 0.39080 0.6707
Prevalence 0.3482 0.13765 0.16194 0.3522
Detection Rate 0.2470 0.03644 0.06883 0.2267
Detection Prevalence 0.3441 0.14170 0.19028 0.3239
Balanced Accuracy 0.7801 0.57132 0.64004 0.7468
g <- ggroc(mlr$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
# Get the feature importance scores
fimp <- varImp(mlr$model)
fimp$importance %>%
arrange(desc(Overall)) %>%
top_n(20)
Overall
Am.Putrescine 100.00000
OS.HpH.Spha.1.P.C18.0 93.04344
OA.OA29...Uracil 91.82792
Lip.PE.O.38.5. 90.87753
OA.OA17...3.Hydroxybutyric.acid 90.68479
Am.Sarcosine 86.83056
med_chol_nee 86.72834
Lip.TG.56.0. 85.98387
OS.HpH.LPA.C20.5 77.36226
Am.L.Histidine 75.27213
Am.Histamine 74.82199
Lip.PC.38.3. 74.12450
Am.Glutathione 72.74617
Lip.SM.d18.1.20.1. 72.52063
Am.Serotonine 70.53271
OA.OA13...Pyruvic.acid 70.27859
Am.L.Glutamine 69.94468
OS.HpH.LPA.C18.0 68.72490
OS.cHpH.aLPA.C16.1 66.13243
Am.L.Valine 65.82711
project <- function(X, m, seed) {
set.seed(seed)
X <- scale(as.matrix(df[, 1:230]))
cov <- cor(X)
# Find redundant features
filter <- RF(cov)
# Filter out redundant features
filtered <- subSet(X, filter)
# Regularized correlation matrix estimation
M <- regcor(filtered)
# Get the regularized correlation matrix of the filtered dataset
R <- M$optCor
mlfa <- mlFA(R, m = 6)
thomson <- facScore(filtered, mlfa$Loadings, mlfa$Uniqueness)
return(thomson)
}
thomson <- project(X, 6, seed = 1234)
X <- cbind(clin_dummy, thomson)
mlrf <- multifit(
X = X,
y = y,
model = "multinom",
ctrl = ctrl,
trace = FALSE,
grid = expand.grid(decay = 0)
)
Multi-class area under the curve: 0.8096
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 51 22 0 1
ADE4 32 8 0 2
SCD 1 2 20 32
SCDE4 2 2 20 52
Overall Statistics
Accuracy : 0.5304
95% CI : (0.4661, 0.5939)
No Information Rate : 0.3522
P-Value [Acc > NIR] : 7.906e-09
Kappa : 0.3548
Mcnemar's Test P-Value : 0.2415
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.5930 0.23529 0.50000 0.5977
Specificity 0.8571 0.84038 0.83092 0.8500
Pos Pred Value 0.6892 0.19048 0.36364 0.6842
Neg Pred Value 0.7977 0.87317 0.89583 0.7953
Precision 0.6892 0.19048 0.36364 0.6842
Recall 0.5930 0.23529 0.50000 0.5977
F1 0.6375 0.21053 0.42105 0.6380
Prevalence 0.3482 0.13765 0.16194 0.3522
Detection Rate 0.2065 0.03239 0.08097 0.2105
Detection Prevalence 0.2996 0.17004 0.22267 0.3077
Balanced Accuracy 0.7251 0.53783 0.66546 0.7239
g <- ggroc(mlrf$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
tree <- multifit(
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = expand.grid(maxdepth = 3)
)
Multi-class area under the curve: 0.8076
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 55 17 0 1
ADE4 27 13 1 7
SCD 2 3 33 59
SCDE4 2 1 6 20
Overall Statistics
Accuracy : 0.4899
95% CI : (0.426, 0.554)
No Information Rate : 0.3522
P-Value [Acc > NIR] : 6.171e-06
Kappa : 0.3335
Mcnemar's Test P-Value : 1.011e-09
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.6395 0.38235 0.8250 0.22989
Specificity 0.8882 0.83568 0.6908 0.94375
Pos Pred Value 0.7534 0.27083 0.3402 0.68966
Neg Pred Value 0.8218 0.89447 0.9533 0.69266
Precision 0.7534 0.27083 0.3402 0.68966
Recall 0.6395 0.38235 0.8250 0.22989
F1 0.6918 0.31707 0.4818 0.34483
Prevalence 0.3482 0.13765 0.1619 0.35223
Detection Rate 0.2227 0.05263 0.1336 0.08097
Detection Prevalence 0.2955 0.19433 0.3927 0.11741
Balanced Accuracy 0.7639 0.60902 0.7579 0.58682
g <- ggroc(tree$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
xgb <- multifit(
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = xgb.grid
)
Multi-class area under the curve: 0.8383
Confusion Matrix and Statistics
Reference
Prediction AD ADE4 SCD SCDE4
AD 67 21 0 0
ADE4 18 11 1 0
SCD 1 0 19 37
SCDE4 0 2 20 50
Overall Statistics
Accuracy : 0.5951
95% CI : (0.5311, 0.6569)
No Information Rate : 0.3522
P-Value [Acc > NIR] : 6.845e-15
Kappa : 0.4371
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: AD Class: ADE4 Class: SCD Class: SCDE4
Sensitivity 0.7791 0.32353 0.47500 0.5747
Specificity 0.8696 0.91080 0.81643 0.8625
Pos Pred Value 0.7614 0.36667 0.33333 0.6944
Neg Pred Value 0.8805 0.89401 0.88947 0.7886
Precision 0.7614 0.36667 0.33333 0.6944
Recall 0.7791 0.32353 0.47500 0.5747
F1 0.7701 0.34375 0.39175 0.6289
Prevalence 0.3482 0.13765 0.16194 0.3522
Detection Rate 0.2713 0.04453 0.07692 0.2024
Detection Prevalence 0.3563 0.12146 0.23077 0.2915
Balanced Accuracy 0.8243 0.61716 0.64571 0.7186
g <- ggroc(xgb$roc$rocs) + scale_color_tableau() + theme_tufte() + guides(color = guide_legend(title = "ROC curves"))
plotly::ggplotly(g)
# cvms::plot_confusion_matrix(cmxgb, palette = "Oranges" , theme_fn = ggthemes::theme_tufte)
aucs <- data.frame(AUC = c("Clinical features only" = benchmark$roc$auc,
"Clinical features + 230 metabolites" = mlr$roc$auc,
"Clinical features + 6 latent factors" = mlrf$roc$auc,
"Decision Tree" = tree$roc$auc, "XGBoost" = xgb$roc$auc))
aucs
AUC
Clinical features only 0.8078106
Clinical features + 230 metabolites 0.8322980
Clinical features + 6 latent factors 0.8095760
Decision Tree 0.8075950
XGBoost 0.8382963
Observations:
Adding serum metabolite information (either the full 230-metabolite matrix or its 6-factor projection) seems to increase the discriminatory power of the models.
Fitting 6 ML-estimated factors obtained by the FMradio package (cummulatively explaining 30% of variace) yields increased classification performance, serving as a valuable dimension reduction technique for high-dimensional data.
Looking at the confusion matrix and individual ROC curves, all models were able to discriminate better among certain classes (AD+E4/SCD+E4, AD+E4/SCD, AD-E4/SCD+E4 and AD-E4/SCD-E4) compared to others (AD+E4/AD-E4 and SCD+E4/SCD-E4).
The covariance network analysis shows distinct metabolic “images” among ApoE4 carriers and non-carriers in AD. The network vertices and edges are distinct.
# Store all observations of ApoE4 non-carriers in C1
C1 <- scale(AD[AD$E4 == 0, 1:230])
# Store all observations of ApoE4 carriers in C2
C2 <- scale(AD[AD$E4 == 1, 1:230])
# Get the covariance matrices of C1 and C2
S1 <- covML(C1)
S2 <- covML(C2)
# Store them in a list
S <- list(S1 = S1, S2 = S2)
# Get the total number of samples
n <- c(nrow(S1), nrow(S2))
# Create a list of fused covariance matrices T
Ts <- default.target.fused(Slist = S, ns = n, type = "DUPV")
# Get the optimal lambdas per class and fused with 10-fold CV
# set.seed(8910)
# optf <- optPenalty.fused(
# Ylist = Ys,
# Tlist = Ts,
# lambda = default.penalty(Ys),
# cv.method = "kCV",
# k = 10,
# verbose = FALSE
# )
# save(optf, file= "data/optf.Rdata")
i = 1 | max diff = 8.0963463215e-01
i = 2 | max diff = 7.3872853063e-29
Converged in 2 iterations, max diff < 1.49e-08.
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting
- Retained elements: 55
- Corresponding to 0.21 % of possible edges
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
Step 5... prepare for plotting
- Retained elements: 205
- Corresponding to 0.78 % of possible edges
GGMs showing distinct metabolic compositions among ApoE4 carriers (left), non-carriers(middle) and the differential edges (right).
# Merge the sparse high precision matrices
TST <- Union(P0s$S1$sparseParCor, P0s$S2$sparseParCor)
PCE4NO <- TST$M1subset
PCE4YES <- TST$M2subset
# Create a color map per metabolite class
Colors <- rownames(PCE4YES)
Colors[grep("Am", rownames(PCE4YES))] <- "pink"
Colors[grep("OA", rownames(PCE4YES))] <- "lightblue"
Colors[grep("Lip", rownames(PCE4YES))] <- "yellow"
Colors[grep("OS", rownames(PCE4YES))] <- "grey"
# httpgd::hgd( width = getOption("httpgd.width", 700),
# height = getOption("httpgd.height", 700),)
set.seed(111213)
# Plot he sparsified ridge matrix of ApoE4 carriers with AD
Coords <- Ugraph(PCE4YES,
type = "fancy", lay = "layout_with_fr",
Vcolor = Colors, prune = FALSE, Vsize = 5, Vcex = 0.3, cut = 0.5,
main = "ApoE4 carriers with AD"
)
# plot(NULL, xaxt = "n", yaxt = "n", bty = "n", ylab = "", xlab = "", xlim = 0:1, ylim = 0:1)
# legend("topleft",
# legend = c(
# "Amines, aminoacids", "Organic Acids", "Lipids",
# "Oxidative stress compounds"
# ), pch = 16, pt.cex = 3, cex = 1.5, bty = "n",
# col = c("pink", "lightblue", "yellow", "grey")
# )
# names <- c("ApoE4 non-carrier edges", "ApoE4 carrier edges", "positive partial cor.", "negative partial cor.")
# clrs <- c("red", "green", "black", "black")
# ltype <- c(1, 1, 1, 2)
# plot(NULL, xaxt = "n", yaxt = "n", bty = "n", ylab = "", xlab = "", xlim = 0:1, ylim = 0:1)
# legend("topleft",
# title = "", legend = names, lty = ltype, lwd = 2, cex = 1.5,
# bty = "n", col = clrs
# )
# Plot the sparsified ridge matrix of ApoE4 non-carriers with AD
Ugraph(PCE4NO,
type = "fancy", lay = NULL, coords = Coords,
Vcolor = Colors, prune = FALSE, Vsize = 5, Vcex = 0.3, cut = 0.5,
main = "ApoE4 non-carriers with AD"
)
# Plot the differential network
diffgraph <- DiffGraph(PCE4NO, PCE4YES,
lay = NULL, coords = Coords,
Vcolor = Colors, Vsize = 5, Vcex = 0.3, main = "Differential Network"
)
Distinct metabolite communities among ApoE4 carriers/non carriers
# Get the communities per class
set.seed(141516)
CommC1 <- Communities(PCE4NO,
Vcolor = Colors, Vsize = 5, Vcex = 0.3,
main = "ApoE4 non-carriers with AD"
)
CommC2 <- Communities(PCE4YES,
Vcolor = Colors, Vsize = 5, Vcex = 0.3,
main = "ApoE4 carriers with AD"
)
PC0list <- list(PCE4NO = PCE4NO, PCE4YES = PCE4YES)
# Get the network statistics
NetStats <- GGMnetworkStats.fused(PC0list)
NetStatsE4yes <- NetStats[, 10:18]
NetStatsE4no <- NetStats[, 1:9]
NetStatsE4yes[, c(3, 9)] <- NetStatsE4no[, c(3, 9)] <- NULL
DegreesE4yes <- as.data.frame(NetStatsE4yes[, 1], rownames(NetStats))
DegreesE4no <- as.data.frame(NetStatsE4no[, 1], row.names = row.names(NetStatsE4no))
head(DegreesE4yes[order(DegreesE4yes[, 1], decreasing = TRUE), , drop = FALSE], 20)
NetStatsE4yes[, 1]
Am.Dopamine 7
Am.ADMA 5
Am.Citrulline 5
Am.L.Kynurenine 5
OA.OA27...3.hydroxyisovaleric.acid 5
OS.HpH.PAF.C16.0 5
Am.X1.Methylhistidine 4
Am.X3.Methoxytyramine 4
Am.DL.3.aminoisobutyric.acid 4
Am.Ethanolamine 4
Am.L.2.aminoadipic.acid 4
Am.L.4.hydroxy.proline 4
Am.SDMA 4
OA.OA06...Glycolic.acid 4
OA.OA09...2.ketoglutaric.acid 4
OA.OA12...Fumaric.acid 4
OA.OA17...3.Hydroxybutyric.acid 4
OA.OA20...Aspartic.acid 4
OA.OA28...Glyceric.acid 4
Lip.TG.56.8. 4
head(DegreesE4no[order(DegreesE4no[, 1], decreasing = TRUE), , drop = FALSE], 20)
NetStatsE4no[, 1]
Am.X3.Methoxytyrosine 5
OS.LpH.NO2.OA 4
Am.Cysteine 3
Am.Dopamine 3
Am.Glutathione 3
Am.L.Aspartic.acid 3
Am.X3.Methylhistidine 2
Am.Hydroxylysine 2
Am.L.4.hydroxy.proline 2
Am.L.carnosine 2
Am.Methyldopa 2
OA.OA03...Citric.acid 2
OA.OA13...Pyruvic.acid 2
OA.OA20...Aspartic.acid 2
OA.OA23...Iminodiacetate 2
OA.OA29...Uracil 2
Lip.TG.60.2. 2
Lip.SM.d18.1.24.0. 2
OS.LpH.NO2.LA 2
OS.LpH.NO2.aLA 2
Wilcoxon Signed Rank test between the network statistics of ApoE4 carriers and non-carriers. The test shows distinct centrality degrees and average number of negative and positive edges.
# Perform a Wilcoxon signed rank test
w <- furrr::future_map2(NetStatsE4yes[, 1:7], NetStatsE4no[, 1:7], ~ {
wilcox.test(.x, .y, paired = TRUE, alternative = "greater")
})
p.values <- names <- list(1:length(w))
for (i in 1:length(w)) {
names[[i]] <- names(w[[i]])[7:length(names(w[[i]]))]
p.values[[i]] <- w[[i]][["p.value"]]
}
names(p.values) <- names(w)
d <- as.data.frame(p.values)
d <- t(d)
d <- cbind.data.frame(d, p.adjust(d, method = "fdr"))
rownames(d) <- sub("PCE4YES.", "", row.names(d))
names(d) <- c("p-value", "FDR")
d
p-value FDR
degree 7.905289e-27 1.844567e-26
betweenness 5.439808e-18 6.346442e-18
eigenCentrality 3.626367e-25 6.346142e-25
nNeg 8.837198e-06 8.837198e-06
nPos 1.436859e-24 2.011603e-24
mutualInfo 1.773939e-30 6.411006e-30
variance 1.831716e-30 6.411006e-30